********************************************************************************
** 	TITLE: clean_district_ROA.do
**
**	PROJECT: IGNITE
** 
**  PURPOSE: Clean district ROA Part 1

********************************************************************************
*		set seed 42	
set sortseed 13	
		di "$date"
********************************************************************************
* A. Clean ROA data
********************************************************************************
	
	/*
		Warning: this part of code may take 15 minutes to run
	*/
*****Create global macro of all District ROA batches, minus the initial batch
		global batches 07Feb 08Feb 09Feb 10Feb 13Feb 22Feb ///
			23Feb 24Feb 28feb 01Mar 02Mar 03Mar 06Mar 07Mar 09Mar ///
			10Mar 13Mar 15Mar 20Mar 21Mar 22Mar
	
	
*****Cleaning the District ROA batches, minus the initial batch	
		foreach date in $batches{

			import excel "$input_data/roas_district/ROA_`date'2023_1.xlsx", clear firstrow
			
			replace case_id = subinstr(case_id," ","",.)
			drop if mi(case_id)
			qui missings dropvars, force
			qui duplicates drop
			qui foreach v of varlist *{
				cap replace `v' = "" if `v'=="  "
			}
			qui compress
			tempfile new`date'
			save `new`date'', replace
			
			
		}
		
		
	
*****Appending the clean District ROA batches to the initial batch	
	
		use "$input_data/roas_district/ROA_cleaned_01feb2023.dta", clear

		foreach date in $batches {
			qui append using `new`date'', force
		}
		
preserve
*****Obtaining the cleaned ROA case numbers!
		keep case_id
		export excel "$output_data/D67/ROA_cleaned_casenum_$date.xlsx", ///
			replace firstrow(variables)
restore
*****Creating age variables		
		split age, p("(")
		gen born_year = subinstr(age2,")","",.)
		destring born_year, replace
		gen age_when_filed = substr(age1,1,2)
		destring age_when_filed, replace
		drop age1 age2
		
		gen sentenced_term = subinstr(minimum_term," DAY(S)","",.)
		destring sentenced_term, replace
		
*****Felonies (identifier and number)
		gen D_felony = 0
		gen num_felonies = 0
		foreach v of varlist c_charge_level* {
			replace D_felony = 1 if strpos(lower(`v'),"felony")
			replace num_felonies = num_felonies+1 if strpos(lower(`v'),"felony")
		}

*****Bond amount
		replace bond_amount = subinstr(subinstr(bond_amount,"$","",.),",","",.)
		destring bond_amount, replace
		
		
*****Dates: case filed, closed, arraignment		
		foreach v of varlist date_filed date_closed c_arraignment_date1{
			gen `v'_new = date(`v',"MDY")
			format `v'_new %td
			drop `v'
			ren `v'_new `v'
		}
		
*****Events: scheduled dates, removed dates, covid dates, circuit court boundover			
		gen num_remove=0
		gen num_g1=0
		gen D_circuit=0
		gen num_remove_covid=0
		
		forvalues j=1/655{
			
		* originially scheduled dates from scheduling notice
			cap gen schedule`j'_date = date(substr(e_comment`j',1,7),"MDY",2050) if strpos(lower(e_description`j'),"scheduled") & ///
				(strpos(lower(e_comment`j'),"01")|strpos(lower(e_comment`j'),"02")|strpos(lower(e_comment`j'),"03")| ///
				strpos(lower(e_comment`j'),"04")|strpos(lower(e_comment`j'),"05")|strpos(lower(e_comment`j'),"06")| ///
				strpos(lower(e_comment`j'),"07")|strpos(lower(e_comment`j'),"08")|strpos(lower(e_comment`j'),"09")| ///
				strpos(lower(e_comment`j'),"10")|strpos(lower(e_comment`j'),"11")|strpos(lower(e_comment`j'),"12"))
				
			cap format schedule`j'_date %td
		
		* originially scheduled dates from rescheduling notice
			cap gen remove`j'_date = date(substr(e_comment`j',1,7),"MDY",2050) if strpos(lower(e_description`j'),"removed from calendar") & ///
				(strpos(lower(e_comment`j'),"01")|strpos(lower(e_comment`j'),"02")|strpos(lower(e_comment`j'),"03")| ///
				strpos(lower(e_comment`j'),"04")|strpos(lower(e_comment`j'),"05")|strpos(lower(e_comment`j'),"06")| ///
				strpos(lower(e_comment`j'),"07")|strpos(lower(e_comment`j'),"08")|strpos(lower(e_comment`j'),"09")| ///
				strpos(lower(e_comment`j'),"10")|strpos(lower(e_comment`j'),"11")|strpos(lower(e_comment`j'),"12"))
			cap format remove`j'_date %td
			
		* notice sent date
			ren event_date`j' event_date`j'_orig
			gen event_date`j' = date(event_date`j'_orig,"MDY")
			format event_date`j' %td
			
			gen remove_event_date`j'=event_date`j' if strpos(lower(e_description`j'),"removed from calendar")
			
		* number of reschedulings
			replace num_remove = num_remove+1 if strpos(lower(e_description`j'),"removed from calendar")
			replace num_remove_covid = num_remove_covid+1 if strpos(lower(e_description`j'),"removed from calendar")& ( (mofd(remove_event_date`j')>= mofd(date("11/10/2020","MDY")) & ///
					mofd(remove_event_date`j')<=mofd(date("05/10/2021","MDY")) )| ///
					(mofd(remove_event_date`j')>= mofd(date("03/17/2020","MDY")) ///
					& mofd(remove_event_date`j')<=mofd(date("06/23/2020","MDY"))))
		* whether case sent to circuit
			replace D_circuit = 1 if strpos(lower(e_description`j'),"circuit")
		}
		
		replace case_id=subinstr(case_id," ","",.)
		duplicates drop case_id, force
		
	*>1 resch per day
preserve
		
		keep case_id remove*_date
		ren remove*_date remove*
		sreshape long remove, i(case_id) j(id) missing(drop all)
		
		keep case_id remove
		duplicates drop
		bys remove: gen num=_N
		keep if num>1
		
		keep case_id
		duplicates drop
		
		gen D_remove_g1=1
		tempfile g1
		save `g1'
restore
		
		merge 1:1 case_id using `g1', nogen
		
	*>3 resch per day
preserve
		
		keep case_id remove*_date
		ren remove*_date remove*
		sreshape long remove, i(case_id) j(id) missing(drop all)
		
		keep case_id remove
		duplicates drop
		bys remove: gen num=_N
		keep if num>3
		
		keep case_id
		duplicates drop
		
		gen D_remove_g3=1
		tempfile g3
		save `g3'
restore
		
		merge 1:1 case_id using `g3', nogen
	
	*>5 resch per day
preserve
		
		keep case_id remove*_date
		ren remove*_date remove*
		sreshape long remove, i(case_id) j(id) missing(drop all)
		
		keep case_id remove
		duplicates drop
		bys remove: gen num=_N
		keep if num>5
		
		keep case_id
		duplicates drop
		
		gen D_remove_g5=1
		tempfile g5
		save `g5'
restore
		
		merge 1:1 case_id using `g5', nogen
		
	*>10 resch per day
preserve
		
		keep case_id remove*_date
		ren remove*_date remove*
		sreshape long remove, i(case_id) j(id) missing(drop all)
		
		keep case_id remove
		duplicates drop
		bys remove: gen num=_N
		keep if num>10
		
		keep case_id
		duplicates drop
		
		gen D_remove_g10=1
		tempfile g10
		save `g10'
restore
		
		merge 1:1 case_id using `g10', nogen
		
		gen case_duration = date_closed-date_filed
		
		gen year_filed = yofd(date_filed)
		gen month_filed = mofd(date_filed)
		format month_filed %tm
		
		gen pre_arr_duration = c_arraignment_date1 - date_filed
		
		gen charge_agency =  substr(c_officer1,strpos(c_officer1,"-")+1, .)
		bys judge_name: gen count_by_judge = _N
		
		gen log_duration = log(case_duration)
		gen log_pre_arr_duration = log(pre_arr_duration)
		
		encode judge_name, gen(judge_name_e)
		gen age_group = 0
		forvalues j=1/4{
			replace age_group=`j' if age_when_filed>=`j'*10 + 18
		}
		qui duplicates drop case_id, force
		
		replace D_remove_g1=0 if mi(D_remove_g1)
		replace D_remove_g3=0 if mi(D_remove_g3)
		replace D_remove_g5=0 if mi(D_remove_g5)
		replace D_remove_g10=0 if mi(D_remove_g10)
		
		ren (D_remove_g1 D_remove_g3 D_remove_g5 D_remove_g10) (D_remove_1pd D_remove_3pd D_remove_5pd D_remove_10pd)
save "$output_data/D67/ROA_cleaned_$date.dta", replace

********************************************************************************
* B. Construct rescheduling measure
********************************************************************************
	
	
		use "$output_data/D67/ROA_cleaned_$date.dta", clear

	*Get full list of case level vars
preserve
		replace case_id = subinstr(case_id," ","",.)
		duplicates drop case_id, force
		keep case_id date_filed date_closed judge_name entitlement pin ///
			case_status balance party_name party_type attorney_name bond_amount ///
			case_duration year_filed month_filed charge_agency plea_hearing_held ///
			incarceration_type minimum_term c_current_charge* c_charge_level* D_felony ///
			num_felonies num_remove D_circuit c_disposition* remove_event_date* ///
			c_sentencing_date1 c_disposition1 incarceration_type sentenced_term ///
			D_remove_1pd D_remove_3pd D_remove_5pd D_remove_10pd num_remove_covid
			
		tempfile for_merge
		save `for_merge'
restore
		replace case_id = subinstr(case_id," ","",.)
		duplicates drop case_id, force
	*Reshape data to event level
		drop event_date*_orig
		keep case_id schedule*_date remove*_date event_date* e_description* date_filed
		
		
		ren schedule*_date schedule*
		ren remove*_date remove*
		sreshape long schedule remove event_date e_description, ///
			i( case_id) j(event_id) missing(drop all)
		
		drop if mi(schedule) & mi(remove)
	
	*Get event level Rescheduling list
preserve
		replace case_id = subinstr(case_id," ","",.)
		duplicates drop case_id, force
		keep if !mi(remove)
		keep case_id remove event_date
		ren (remove event_date) (schedule first_remove_event_date)
		duplicates drop  case_id schedule , force
		tempfile reschedule_temp
		save `reschedule_temp'
restore
	
	*Get event level scheduling list
		keep if !mi(schedule)
		duplicates drop  case_id schedule , force
		
	* Merge scheduling and rescheduling by originially scheduled dates
		merge 1:1 case_id schedule using `reschedule_temp', keep(1 3)
		gen matched = _merge==3
		drop _merge
		
		drop event_id
		bys case_id (event_date): gen event_id=_n
		
	* Keep if the frist scheduled event was rescheduled
		gen D_remove_first = matched==1 & event_id==1
		replace case_id = subinstr(case_id," ","",.)
		
		
		gen cat=""
		replace cat = "Prelim Exam" if strpos(e_description,"EXAM")>0
		replace cat = "Arraignment/Pre-Trial" if strpos(e_description,"ARRAIGNMENT")>0|strpos(e_description,"PRE-TRIAL")>0
		replace cat = "Probable Cause Conference" if strpos(e_description,"PROBABLE CAUSE")>0
		replace cat = "Sentencing" if strpos(e_description,"SENTENC")>0
		replace cat = "Review" if strpos(e_description,"REVIEW")>0
		replace cat = "Showcause Hearing" if strpos(e_description,"SHOWCAUSE")>0
		replace cat = "Jury Trial" if strpos(e_description,"JURY-TRIAL")>0
		replace cat = "Other" if mi(cat)
		
		gen fel = substr(case_id,-2,2)=="FY"
		gen first = event_id==1
		gen one_m = mofd(schedule) - mofd(date_filed)<=1
		
		replace cat = "Other" if cat == "Arraignment/Pre-Trial"&fel==1
		replace cat = "Other" if cat == "Jury Trial"&fel==1
		
		replace cat = "Other" if cat == "Probable Cause Conference"&fel==0
		replace cat = "Other" if cat == "Prelim Exam" & fel==0
				
		replace case_id = subinstr(case_id," ","",.)
		duplicates drop case_id, force
		* Merge with case level data
		merge 1:1 case_id using `for_merge', nogen
		replace D_remove_first = 0 if mi(D_remove_first)
		
		drop remove matched event_id
		
		gen D_remove_any = num_remove>0
		gen D_remove_covid = num_remove_covid>0
save "$output_data/D67/ROA_for_merge.dta", replace
		

********************************************************************************
* C. Construct rescheduling measure
********************************************************************************
	
	
		use "$output_data/D67/ROA_cleaned_$date.dta", clear

		
	*Get full list of case level vars
		preserve
		replace case_id = subinstr(case_id," ","",.)
		keep case_id date_filed date_closed judge_name entitlement pin ///
			case_status balance party_name party_type attorney_name bond_amount ///
			case_duration year_filed month_filed charge_agency plea_hearing_held ///
			incarceration_type minimum_term c_current_charge* c_charge_level* D_felony ///
			num_felonies num_remove D_circuit c_disposition* remove_event_date* 
			duplicates drop case_id, force
		tempfile for_merge
		save `for_merge'
		restore
		
	*Reshape data to event level
		drop event_date*_orig
		keep case_id schedule*_date remove*_date event_date* e_description* date_filed
		
		
		ren schedule*_date schedule*
		ren remove*_date remove*
		sreshape long schedule remove event_date e_description, ///
			i( case_id) j(event_id) missing(drop all)
		
		drop if mi(schedule) & mi(remove)
	
	*Get event level Rescheduling list
		preserve
		keep if !mi(remove)
		keep case_id remove event_date
		ren (remove event_date) (schedule first_remove_event_date)
		duplicates drop  case_id schedule , force
		tempfile reschedule_temp
		save `reschedule_temp'
		restore
	
	*Get event level scheduling list
		keep if !mi(schedule)
		duplicates drop  case_id schedule , force
		
	* Merge scheduling and rescheduling by originially scheduled dates
		merge 1:1 case_id schedule using `reschedule_temp', keep(1 3)
		gen matched = _merge==3
		drop _merge
		
		drop event_id
		bys case_id (event_date): gen event_id=_n
		
	* Keep if the frist scheduled event was rescheduled
		gen D_remove_first = matched==1 & event_id==1
		replace case_id = subinstr(case_id," ","",.)
		
		
		gen cat=""
		replace cat = "Prelim Exam" if strpos(e_description,"EXAM")>0
		replace cat = "Arraignment/Pre-Trial" if strpos(e_description,"ARRAIGNMENT")>0|strpos(e_description,"PRE-TRIAL")>0
		replace cat = "Probable Cause Conference" if strpos(e_description,"PROBABLE CAUSE")>0
		replace cat = "Sentencing" if strpos(e_description,"SENTENC")>0
		replace cat = "Review" if strpos(e_description,"REVIEW")>0
		replace cat = "Showcause Hearing" if strpos(e_description,"SHOWCAUSE")>0
		replace cat = "Jury Trial" if strpos(e_description,"JURY-TRIAL")>0
		replace cat = "Other" if mi(cat)
		
		gen fel = substr(case_id,-2,2)=="FY"
		gen first = event_id==1
		gen one_m = mofd(schedule) - mofd(date_filed)<=1
		
		replace cat = "Other" if cat == "Arraignment/Pre-Trial"&fel==1
		replace cat = "Other" if cat == "Jury Trial"&fel==1
		
		replace cat = "Other" if cat == "Probable Cause Conference"&fel==0
		replace cat = "Other" if cat == "Prelim Exam" & fel==0
		
		
		gen D_remove_event = matched==1 & ((cat=="Probable Cause Conference"&fel==1)|(cat=="Arraignment/Pre-Trial"&fel==0))
		
preserve
		keep if D_remove_event==1
		bys case_id: egen min_id = min(event_id)
		drop if event_id != min_id
		duplicates drop case_id, force
		keep case_id D_remove_event schedule
		tempfile res_event
		save `res_event'
restore
		
	* Merge with case level data
		keep if D_remove_event==0
		bys case_id: egen min_id = min(event_id)
		drop if event_id != min_id
		duplicates drop case_id, force
		keep case_id D_remove_event schedule
		
		append using `res_event'
		duplicates tag case_id, gen(tag)
		drop if tag>0 & D_remove_event==0
		drop tag
		
		merge 1:1 case_id using `for_merge', nogen
		
		replace D_remove_event = 0 if mi(D_remove_event)
		
		ren schedule schedule_pcc_pret
		keep case_id D_remove_event schedule_pcc_pret
save "$output_data/D67/ROA_for_merge_event.dta", replace
		
